import os
import json
from openai import AzureOpenAI
from tqdm import tqdm
import sys
import re
import base64
import cv2
from PIL import Image
from io import BytesIO
import random
import pandas as pd
import numpy as np
import argparse

# Command-line options: which slice of the test set to score, where the
# results go, and an optional precomputed similarity file.
parser = argparse.ArgumentParser()
_CLI_OPTIONS = (
    ("--start", dict(type=int, default=0, help="Start index (inclusive)")),
    ("--end", dict(type=int, default=None, help="End index (inclusive)")),
    ("--output_dir", dict(type=str, default=".", help="Directory to store JSON results")),
    ("--batch_size", dict(type=int, default=8, help="Save frequency")),
    ("--similarity_json", dict(type=str, default=None, help="Path to similarity JSON file (optional)")),
)
for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)
# Parsed once at import time; the rest of the script reads `args` directly.
args = parser.parse_args()


# Static demographic persona prompts, keyed by "<age-range>_<gender>".
# Each value is a complete prompt: a persona description, the few-shot framing
# ("5 example website screenshots ... 0–10 scale"), and the required
# "Answer:/Reason:" output format.
# NOTE(review): nothing in the visible part of this script reads this dict —
# the agents used for scoring are built from the personas CSV instead. Confirm
# whether it is still needed before relying on or removing it.
persona_prompts = {
    "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "18-24_male": """You are a man aged 18–24. You're used to fast-scroll content and visual punch—memes, Twitch, TikTok, YouTube. You like websites that grab attention fast: bold layouts, smart design, or a bit of edge. If a website feels outdated, cluttered, or boring, it loses your interest quickly.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on visuals, usability, and vibe.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "25-34_female": """You are a woman aged 25–34. You appreciate modern, polished websites that feel aligned with your lifestyle—whether it's wellness, creativity, relationships, or career. You like clean layouts, elegant color palettes, and visuals that are both pretty and purposeful.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design, clarity, aesthetics, and content.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "35-44_male": """You are a man aged 35–44. You like websites that are grounded, practical, and cleanly designed. Strong layouts, good use of space, and purpose-driven content grab your attention more than visual noise.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on structure, relevance, and visual balance.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "45-54_female": """You are a woman aged 45–54. You like websites that are calm, clear, and visually composed. Design that feels warm, thoughtful, and emotionally grounded appeals more than flashy visuals or trendy noise.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on clarity, emotional tone, and visual presentation.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]""",

    "55+_male": """You are a man aged 55 or older. You value websites that are straightforward, honest, and easy to engage with. Flashy or cluttered pages can feel frustrating, while clear structure and meaningful content feel worthwhile.

You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on clarity, usefulness, and visual comfort.

Return your response in this exact format:
Answer: [0–10] ← You must include this numerical score.
Reason: [Why this website does or doesn't appeal to you visually and emotionally in minimal words]"""
}

def frame_to_data_url(frame_bgr):
    """Encode an OpenCV BGR frame as a base64 JPEG ``data:`` URL.

    The frame is converted to RGB, resized to 256x256 with Lanczos resampling
    (the aspect ratio is not preserved), JPEG-encoded in memory and returned
    as ``data:image/jpeg;base64,<payload>``.

    ``frame_bgr`` must be a valid image array; a ``None`` frame (for example a
    failed ``cv2.imread``) makes the colour conversion raise.
    """
    rgb_pixels = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    thumbnail = Image.fromarray(rgb_pixels).resize((256, 256), Image.LANCZOS)

    # Encode to JPEG entirely in memory; nothing is written to disk.
    jpeg_buffer = BytesIO()
    thumbnail.save(jpeg_buffer, format="JPEG")
    jpeg_bytes = jpeg_buffer.getvalue()

    payload = base64.b64encode(jpeg_bytes).decode('utf-8')
    return "data:image/jpeg;base64," + payload

# Azure OpenAI Configuration
# The key and endpoint are taken from the environment when available so that
# real credentials never have to be committed to this file. When a variable is
# unset or empty, the original in-file placeholder is used, which keeps the
# previous behaviour for anyone who edits the values here directly.
api_version = "2024-02-15-preview"
config_dict = {
    'api_key': os.environ.get("AZURE_OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY",
    'api_version': api_version,
    'azure_endpoint': os.environ.get("AZURE_OPENAI_ENDPOINT") or "https://your-azure-openai-endpoint/"
}

def create_persona_system_prompt(persona_specification):
    """Build the system prompt for one persona agent.

    The text before the first period of ``persona_specification`` (the whole
    string when it contains no period), followed by a period, is used as a
    short lead-in after "You are". The full specification, the evaluation
    instructions and the expected "Reason:/Answer:" output format follow.

    The four-space indentation of the continuation lines inside the literal is
    part of the returned text.
    """
    lead_in, _, _ = persona_specification.partition('.')
    first_sentence = lead_in + '.'

    system_prompt = f"""You are {first_sentence}
    
    {persona_specification}
    
    You are evaluating website aesthetics and design quality. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content, considering your unique background, personality, and preferences.
    
    You can provide precise scores including decimal values (e.g., 7.5, 8.2) to better reflect your nuanced judgment.
    
    You may be given up to 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot.
    
    Return:
    Reason: [Explain your reaction based on your background and preferences]
    Answer: [0–10] ← You must include this score."""
    return system_prompt

def get_json_data_generate(sys_prompt, user_prompt, images):
    """Assemble the chat ``messages`` payload for one scoring request.

    ``images`` is a sequence of ``(data_url, score)`` pairs. Every pair except
    the last is a scored example: it is attached at ``"low"`` detail together
    with its score formatted to two decimals. The last pair is the screenshot
    to be rated: it is attached at ``"high"`` detail and its score is ignored.

    Returns ``{"messages": [system_message, user_message]}`` where the user
    message content starts with ``user_prompt`` as a text part, followed by
    one image part per entry of ``images`` in order.
    """
    # NOTE(review): "score" is an extra key on the example image parts —
    # confirm the endpoint accepts (or at least ignores) it.
    parts = [{"type": "text", "text": user_prompt}]
    last_position = len(images) - 1

    for position, (url, example_score) in enumerate(images):
        if position == last_position:
            # Target screenshot: the one the model is asked to score.
            part = {
                "type": "image_url",
                "image_url": {"url": url, "detail": "high"}
            }
        else:
            # Few-shot example carrying its known score.
            part = {
                "type": "image_url",
                "image_url": {"url": url, "detail": "low"},
                "score": f"{example_score:.2f}"
            }
        parts.append(part)

    system_message = {"role": "system", "content": sys_prompt}
    user_message = {"role": "user", "content": parts}
    return {"messages": [system_message, user_message]}

# Shared client, created lazily on the first request. Building a new client
# per call opens a fresh HTTP connection pool every time and never closes it.
_AZURE_CLIENT = None


def _get_azure_client():
    """Return the process-wide AzureOpenAI client, creating it on first use."""
    global _AZURE_CLIENT
    if _AZURE_CLIENT is None:
        _AZURE_CLIENT = AzureOpenAI(
            api_key=config_dict['api_key'],
            api_version=config_dict['api_version'],
            azure_endpoint=config_dict['azure_endpoint'],
        )
    return _AZURE_CLIENT


def verbalize(prompt, sys_prompt, images):
    """Send one scoring request to the chat model and return its reply text.

    Args:
        prompt: User prompt text placed before the images.
        sys_prompt: System prompt describing the persona.
        images: ``(data_url, score)`` pairs as expected by
            ``get_json_data_generate``; the last pair is the image to score.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped.

    Raises:
        ValueError: If the response carries no text content (``content`` is
            ``None``), instead of failing later with an opaque
            ``AttributeError``.
        Exception: Any error raised by the OpenAI client is propagated.
    """
    json_data = get_json_data_generate(sys_prompt, prompt, images)
    response = _get_azure_client().chat.completions.create(
        model='gpt-4o',
        messages=json_data["messages"],
        max_tokens=350,
        temperature=0.85,
        n=1
    )
    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        raise ValueError(
            f"Model returned no text content (finish_reason={choice.finish_reason!r})"
        )
    return content.strip()

# Read the persona table; each row supplies an 'agent_id' and a free-text
# 'specification' for one agent.
personas_filename = "/path/to/shortlisted_agents_train.csv"
personas_df = pd.read_csv(personas_filename)

# agent_id -> {'specification', 'system_prompt'}; the system prompt is derived
# from the specification once here so it is not rebuilt for every request.
agents = {
    persona['agent_id']: {
        'specification': persona['specification'],
        'system_prompt': create_persona_system_prompt(persona['specification'])
    }
    for _, persona in personas_df.iterrows()
}

print(f"Loaded {len(agents)} persona agents: {list(agents.keys())}")

# Load similarity data
# The similarity file is optional: when it is missing or unusable the script
# continues with an empty mapping, and the main loop then fills the example
# slots with random rows. Failures are reported instead of being silently
# swallowed so a corrupt or mistyped path is noticed.
if args.similarity_json is not None:
    SIM_JSON_PATH = args.similarity_json
else:
    SIM_JSON_PATH = os.path.join(os.path.dirname(__file__), "similaritywebaes.json")

SIMILARITY_DATA = {}
if os.path.exists(SIM_JSON_PATH):
    try:
        with open(SIM_JSON_PATH, "r", encoding="utf-8") as f:
            _loaded_similarity = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and decoding errors.
        print(f"Warning: could not load similarity data from {SIM_JSON_PATH}: {e}")
    else:
        if isinstance(_loaded_similarity, dict):
            SIMILARITY_DATA = _loaded_similarity
        else:
            # Later lookups use SIMILARITY_DATA.get(...), which needs a dict.
            print(
                f"Warning: similarity data in {SIM_JSON_PATH} is not a JSON object "
                f"(got {type(_loaded_similarity).__name__}); ignoring it"
            )
elif args.similarity_json is not None:
    # Only warn when the user explicitly asked for a file that is not there;
    # the default path is allowed to be absent.
    print(f"Warning: similarity file {SIM_JSON_PATH} not found; continuing without it")

# Read the table of screenshots to score.
test_filename = "/path/to/test_list.csv"
df = pd.read_csv(test_filename)

# Row positions to process: from --start up to --end (inclusive, clamped to
# the table length), or through the last row when --end is not given.
total_rows = len(df)
slice_stop = total_rows if args.end is None else min(args.end + 1, total_rows)
sample_indices = list(range(args.start, slice_stop))

# Make sure the results directory exists before the first save.
os.makedirs(args.output_dir, exist_ok=True)

# Main loop: for every selected row, pick up to NUM_EXAMPLES scored example
# screenshots, ask each persona agent to rate the target screenshot, and store
# the per-agent replies together with the mean predicted score.

# Directory prefix for the screenshots named in the 'image' column.
IMAGE_DIR = '/path/to/images/'
# Number of few-shot example screenshots attached to each request.
NUM_EXAMPLES = 5
# Valid range of a likeability score.
SCORE_MIN, SCORE_MAX = 0.0, 10.0

# A number directly after an "Answer:" label (optionally wrapped in "[" or
# markdown "*"), and the generic fallback that matches any number.
_ANSWER_RE = re.compile(r'Answer\s*:\s*[\[\*\s]*(\d+(?:\.\d+)?)', re.IGNORECASE)
_NUMBER_RE = re.compile(r'\b\d+(?:\.\d+)?\b')

# Filename -> first row position, built once so that resolving a similar image
# does not scan the whole frame. Positions (not index labels) are stored
# because rows are read back with df.iloc.
_image_to_pos = {}
for _pos, _name in enumerate(df['image']):
    _image_to_pos.setdefault(_name, _pos)


def _extract_score(resp):
    """Parse the predicted score out of a model reply.

    The number following the last "Answer:" label is preferred, so that other
    numbers in the reply (e.g. the "10" in "7/10", or figures quoted in the
    reasoning) are not mistaken for the score. Without such a label the last
    number in the text is used. Returns ``None`` when no number is found or
    the value lies outside the 0–10 scale.
    """
    candidates = _ANSWER_RE.findall(resp) or _NUMBER_RE.findall(resp)
    if not candidates:
        return None
    score = float(candidates[-1])
    if not SCORE_MIN <= score <= SCORE_MAX:
        return None
    return score


def _image_path(fname):
    """Return the on-disk path for an 'image' column value."""
    return IMAGE_DIR + fname.replace('_resized', '')


def _select_example_indices(i):
    """Choose up to NUM_EXAMPLES row positions to use as examples for row i.

    Rows listed as similar to ``i`` in SIMILARITY_DATA come first (in listed
    order, skipping unknown filenames, ``i`` itself and duplicates); remaining
    slots are filled with distinct random rows other than ``i``.
    """
    chosen = []
    for sim in SIMILARITY_DATA.get(str(i), {}).get("similar_images", []):
        if len(chosen) >= NUM_EXAMPLES:
            break
        fname = sim.get("image", "")
        if not isinstance(fname, str) or not fname:
            continue
        j = _image_to_pos.get(fname)
        if j is not None and j != i and j not in chosen:
            chosen.append(j)

    if len(chosen) < NUM_EXAMPLES:
        excluded = set(chosen) | {i}
        pool = [k for k in range(df.shape[0]) if k not in excluded]
        needed = min(NUM_EXAMPLES - len(chosen), len(pool))
        chosen.extend(random.sample(pool, needed))
    return chosen


def _save_results(results, path):
    """Write results as JSON via a temporary file and an atomic rename.

    An interrupted write therefore never leaves a truncated results file.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, 'w') as f:
        json.dump(results, f, indent=4)
    os.replace(tmp_path, path)


# Resolved before the loop so the final summary can always name the file,
# even when no row was processed successfully.
slice_tag = f"{args.start}_{args.end if args.end is not None else 'end'}"
out_path = os.path.join(args.output_dir, f"results_gpt4o_persona_static_web_aes_ten_slice_{slice_tag}.json")
# Honour --batch_size as the save frequency; guard against zero/negative.
save_every = max(1, args.batch_size)

response_dict = []
unsaved = 0
try:
    for i in tqdm(sample_indices):
        try:
            d = df.iloc[i]
            value = d.to_dict()
            image_path = _image_path(d['image'])
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread signals a missing/undecodable file with None.
                raise OSError(f"could not read image {image_path}")
            image_url = frame_to_data_url(image)

            # Load the selected examples; unreadable ones are skipped.
            example_lines = []
            example_images = []
            for idx in _select_example_indices(i):
                try:
                    row = df.iloc[idx]
                    score = row['mean_score']
                    img = cv2.imread(_image_path(row['image']))
                    if img is None:
                        continue
                    img_url = frame_to_data_url(img)
                    score_line = f"Score: {score:.1f}"
                except Exception as e:
                    print(f"Skipping example {idx} for row {i}: {e}")
                    continue
                example_lines.append(score_line)
                example_images.append((img_url, score))

            # Add the current image as the last one
            example_images.append((image_url, None))
            examples_text = "\n".join(example_lines)

            num_examples = len(example_lines)
            if num_examples > 0:
                prompt = f"""Given the images below, the first {num_examples} are example website screenshots with their likeability scores (on a 0-10 scale, see the list below). The last image is the one you should score. Based on your background and preferences, carefully consider the last image and give a score between 0 to 10 based on how much you like the website's visual design, layout, colors, and content.
        
Here are {num_examples} example likeability scores (in order):
{examples_text}"""
            else:
                prompt = """Based on your background and preferences, carefully consider the website screenshot below and give a score between 0 to 10 based on how much you like the website's visual design, layout, colors, and content."""

            # Get predictions from all agents; one failing agent does not
            # abort the row.
            agent_responses = {}
            predictions = []

            for agent_id, agent_info in agents.items():
                try:
                    resp = verbalize(prompt, agent_info['system_prompt'], example_images)
                    prediction = _extract_score(resp)

                    agent_responses[agent_id] = {
                        "prompt": prompt,
                        "prediction": prediction,
                        "reason": resp
                    }

                    if prediction is not None:
                        predictions.append(prediction)

                except Exception as e:
                    print(f"Error with agent {agent_id}: {e}")
                    agent_responses[agent_id] = {
                        "prompt": prompt,
                        "prediction": None,
                        "reason": f"Error: {str(e)}"
                    }

            # Mean over the agents that produced a valid score.
            mean_prediction = float(np.mean(predictions)) if predictions else None

            # Store results
            value.update({
                "agent_responses": agent_responses,
                "mean_prediction": mean_prediction,
                "ground_truth": d['mean_score'],
                "valid_predictions_count": len(predictions),
                "total_agents": len(agents)
            })
            response_dict.append(value)

            # Flush to disk every `save_every` processed rows.
            unsaved += 1
            if unsaved >= save_every:
                _save_results(response_dict, out_path)
                unsaved = 0

        except Exception as e:
            print(f"Error processing row {i}: {e}")
            continue
finally:
    # Persist whatever has not been written yet, including on Ctrl-C.
    if unsaved:
        _save_results(response_dict, out_path)

print(f"Evaluation completed. Processed {len(response_dict)} samples → {out_path}")